Data Loading and Preparation

# Load integrated dataset with IGO information
integrated_data <- read.csv("integrated_dataset_with_igo.csv", stringsAsFactors = FALSE)

# Load network status dataset with pre-calculated PCAs
network_data <- read.csv("all_network_status_pca.csv", stringsAsFactors = FALSE)

# Check dimensions
dim(integrated_data)
## [1] 8276   41
dim(network_data)
## [1] 2202   18

prepare the data for the attribute-based PCA. 1. Convert any non-numeric values to NA 2. Select appropriate variables for the PCA (excluding cinc as specified) 3. Handle missing values

# Convert "NA" strings to actual NA values
integrated_data <- integrated_data %>%
  mutate(across(everything(), ~ifelse(. == "NA", NA, .)))

# Convert columns to appropriate types
integrated_data <- integrated_data %>%
  mutate(across(c(milex, milper, irst, pec, tpop, upop, 
                 full_memberships, associate_memberships, observer_statuses,
                 total_memberships, prestigious_memberships, regional_memberships,
                 global_memberships, security_memberships, economic_memberships,
                 political_memberships), 
                ~as.numeric(.)))

# Convert political variables
integrated_data <- integrated_data %>%
  mutate(across(c(democ, autoc, polity, polity2, xconst), ~as.numeric(.)))

# Let's look at missing values
missing_vals <- integrated_data %>%
  select(milex, milper, irst, pec, tpop, upop, 
         democ, autoc, polity, polity2, xconst,
         full_memberships, associate_memberships, observer_statuses,
         prestigious_memberships, regional_memberships, global_memberships,
         security_memberships, economic_memberships, political_memberships,
         institutional_status, functional_balance) %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  gather(key = "variable", value = "missing_count")

# Display variables with missing values
missing_vals %>%
  arrange(desc(missing_count)) %>%
  mutate(percentage = missing_count / nrow(integrated_data) * 100) %>%
  filter(missing_count > 0)
##                   variable missing_count percentage
## 1         full_memberships           398  4.8090865
## 2    associate_memberships           398  4.8090865
## 3        observer_statuses           398  4.8090865
## 4  prestigious_memberships           398  4.8090865
## 5     regional_memberships           398  4.8090865
## 6       global_memberships           398  4.8090865
## 7     security_memberships           398  4.8090865
## 8     economic_memberships           398  4.8090865
## 9    political_memberships           398  4.8090865
## 10    institutional_status           398  4.8090865
## 11      functional_balance           398  4.8090865
## 12                 polity2           173  2.0903818
## 13                   democ            82  0.9908168
## 14                   autoc            82  0.9908168
## 15                  polity            82  0.9908168
## 16                  xconst            82  0.9908168

Subset Data for Selected Years

Since the network_data has data only for specific years, we’ll subset the integrated_data for those years:

# Extract years from network data
network_years <- unique(network_data$year)

# Filter integrated data for those years
integrated_subset <- integrated_data %>%
  filter(Year %in% network_years)

# Check dimensions
dim(integrated_subset)
## [1] 1718   41

Attribute-Based PCA

variables related 1. Material capabilities (milex, milper, pec, tpop, upop) 2. Institutional memberships (membership variables) 3. Political indicators (democ, polity2, xconst)

# Select variables for PCA
# We'll avoid using cinc as specified
pca_vars <- c("milex", "milper", "irst", "pec", "tpop", "upop", 
              "full_memberships", "prestigious_memberships", 
              "regional_memberships", "global_memberships",
              "security_memberships", "economic_memberships", "political_memberships",
              "polity2", "xconst")

# Create a dataframe with only the selected variables and complete cases
pca_data <- integrated_subset %>%
  select(Destination, Year, all_of(pca_vars)) %>%
  na.omit()

# Check dimensions
dim(pca_data)
## [1] 1505   17
# Scale the data before PCA
pca_data_scaled <- pca_data %>%
  select(all_of(pca_vars)) %>%
  scale()

# Perform PCA
pca_result <- PCA(pca_data_scaled, graph = FALSE)

# Examine eigenvalues to determine number of components to retain
fviz_eig(pca_result, addlabels = TRUE)

# Visualize variable contributions to first two dimensions
fviz_pca_var(pca_result, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

# Generate scree plot to determine how many components to retain
fviz_screeplot(pca_result, addlabels = TRUE)

# Print summary of PCA
summary(pca_result)
## 
## Call:
## PCA(X = pca_data_scaled, graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               5.257   3.621   1.166   1.010   0.794   0.700   0.583
## % of var.             35.049  24.140   7.773   6.734   5.296   4.667   3.889
## Cumulative % of var.  35.049  59.189  66.962  73.696  78.992  83.658  87.547
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.489   0.462   0.400   0.238   0.156   0.080   0.044
## % of var.              3.257   3.081   2.668   1.584   1.039   0.533   0.291
## Cumulative % of var.  90.804  93.885  96.553  98.137  99.176  99.709 100.000
##                       Dim.15
## Variance               0.000
## % of var.              0.000
## Cumulative % of var. 100.000
## 
## Individuals (the 10 first)
##                             Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2
## 1                       |  3.483 | -2.770  0.097  0.632 |  1.354  0.034  0.151
## 2                       |  3.132 | -2.508  0.080  0.641 |  1.231  0.028  0.155
## 3                       |  2.899 | -2.341  0.069  0.652 |  1.127  0.023  0.151
## 4                       |  2.847 | -2.289  0.066  0.646 |  1.161  0.025  0.166
## 7                       |  2.702 | -2.175  0.060  0.648 |  1.024  0.019  0.144
## 8                       |  5.742 | -2.172  0.060  0.143 |  0.814  0.012  0.020
## 9                       |  2.572 | -1.812  0.041  0.496 |  1.307  0.031  0.258
## 13                      |  4.727 | -3.331  0.140  0.497 |  1.994  0.073  0.178
## 14                      |  4.792 | -3.384  0.145  0.499 |  2.063  0.078  0.185
## 15                      |  4.689 | -3.868  0.189  0.680 |  2.189  0.088  0.218
##                            Dim.3    ctr   cos2  
## 1                       |  0.230  0.003  0.004 |
## 2                       |  0.169  0.002  0.003 |
## 3                       |  0.092  0.000  0.001 |
## 4                       |  0.051  0.000  0.000 |
## 7                       | -0.005  0.000  0.000 |
## 8                       | -0.436  0.011  0.006 |
## 9                       | -0.181  0.002  0.005 |
## 13                      |  1.739  0.172  0.135 |
## 14                      |  1.763  0.177  0.135 |
## 15                      |  0.847  0.041  0.033 |
## 
## Variables (the 10 first)
##                            Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## milex                   |  0.518  5.112  0.269 |  0.339  3.169  0.115 |  0.609
## milper                  |  0.439  3.661  0.192 |  0.717 14.210  0.515 | -0.080
## irst                    |  0.561  5.995  0.315 |  0.596  9.812  0.355 | -0.004
## pec                     |  0.676  8.694  0.457 |  0.624 10.738  0.389 |  0.276
## tpop                    |  0.466  4.128  0.217 |  0.714 14.066  0.509 | -0.397
## upop                    |  0.566  6.098  0.321 |  0.698 13.461  0.487 | -0.280
## full_memberships        |  0.817 12.687  0.667 | -0.424  4.967  0.180 | -0.168
## prestigious_memberships |  0.768 11.228  0.590 | -0.465  5.970  0.216 | -0.156
## regional_memberships    |  0.624  7.395  0.389 | -0.303  2.542  0.092 |  0.022
## global_memberships      |  0.810 12.493  0.657 | -0.422  4.908  0.178 | -0.172
##                            ctr   cos2  
## milex                   31.841  0.371 |
## milper                   0.546  0.006 |
## irst                     0.001  0.000 |
## pec                      6.540  0.076 |
## tpop                    13.528  0.158 |
## upop                     6.720  0.078 |
## full_memberships         2.411  0.028 |
## prestigious_memberships  2.097  0.024 |
## regional_memberships     0.042  0.000 |
## global_memberships       2.538  0.030 |

Based on the scree plot and eigenvalues, this part will extract the first component as the attribute-based status score:

# Extract PCA scores for the first component
pca_scores <- as.data.frame(pca_result$ind$coord)

# Create a dataframe with country, year, and PCA score
attribute_scores <- cbind(
  pca_data %>% select(Destination, Year),
  attribute_status_pca = pca_scores[, 1]  # First principal component
)

# View the first few rows
head(attribute_scores)
##   Destination Year attribute_status_pca
## 1 Afghanistan 1960            -2.769927
## 2 Afghanistan 1965            -2.508063
## 3 Afghanistan 1970            -2.340794
## 4 Afghanistan 1975            -2.289075
## 7 Afghanistan 1990            -2.175058
## 8 Afghanistan 1995            -2.172125

Merge with Network Status Data

merge the attribute-based PCA scores with the network status data:

# Merge the datasets based on country and year
integrated_status <- network_data %>%
  inner_join(attribute_scores, by = c("country" = "Destination", "year" = "Year"))

# Check the dimensions
dim(integrated_status)
## [1] 1505   19

Combine Status Dimensions

create an overall status score by combining the attribute-based and network-based measures:

# Standardize both PCA scores
integrated_status <- integrated_status %>%
  mutate(
    attribute_status_pca_z = scale(attribute_status_pca)[,1],
    overall_status_network_pca_z = scale(overall_status_network_pca)[,1]
  )

# Create a combined status score (simple average)
integrated_status <- integrated_status %>%
  mutate(combined_status_score = (attribute_status_pca_z + overall_status_network_pca_z) / 2)

# Create status quartiles
integrated_status <- integrated_status %>%
  group_by(year) %>%
  mutate(
    attribute_status_quartile = ntile(attribute_status_pca, 4),
    network_status_quartile = ntile(overall_status_network_pca, 4),
    combined_status_quartile = ntile(combined_status_score, 4)
  ) %>%
  ungroup()

# Create status types
integrated_status <- integrated_status %>%
  mutate(
    status_type = case_when(
      combined_status_quartile == 4 ~ "High Status",
      combined_status_quartile == 3 ~ "Upper-Middle Status",
      combined_status_quartile == 2 ~ "Lower-Middle Status",
      combined_status_quartile == 1 ~ "Low Status"
    )
  )

# Calculate status inconsistency (difference between attribute and network status)
integrated_status <- integrated_status %>%
  mutate(status_inconsistency = abs(attribute_status_pca_z - overall_status_network_pca_z))

# View the final integrated dataset
head(integrated_status)
## # A tibble: 6 × 27
##   country recognition_count weighted_recognition eigenvector_centrality pagerank
##   <chr>               <int>                <dbl>                  <dbl>    <dbl>
## 1 Czecho…                39                 49.6                  0.528   0.0125
## 2 Egypt                  58                 60.2                  0.710   0.0196
## 3 France                 82                 75.5                  1       0.0379
## 4 India                  55                 43.8                  0.660   0.0205
## 5 Indone…                33                 31.8                  0.490   0.0114
## 6 Iran                   36                 31.5                  0.540   0.0114
## # ℹ 22 more variables: authority <dbl>, betweenness <dbl>,
## #   recognition_rate <dbl>, network_inconsistency <dbl>, outgoing_ties <int>,
## #   recognition_balance <int>, recognition_ratio <dbl>,
## #   recognition_status_pca <dbl>, prestige_status_pca <dbl>,
## #   brokerage_status_pca <dbl>, overall_status_network_pca <dbl>, year <int>,
## #   external_internal_ratio <dbl>, attribute_status_pca <dbl>,
## #   attribute_status_pca_z <dbl>, overall_status_network_pca_z <dbl>, …

Visualization: Status Dimensions

creating visualizations to explore the status dimensions:

# Create a scatter plot of attribute status vs. network status
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca)) +
  geom_point(aes(color = status_type)) +
  geom_text_repel(data = subset(integrated_status, 
                               attribute_status_pca > quantile(attribute_status_pca, 0.9) | 
                               overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
                 aes(label = country), max.overlaps = 15) +
  facet_wrap(~ year) +
  labs(x = "Attribute-Based Status", y = "Network-Based Status",
       title = "Status Dimensions by Year",
       color = "Status Type") +
  theme_minimal()

Cluster Analysis

perform cluster analysis to identify status groups:

# Prepare data for clustering
cluster_data <- integrated_status %>%
  select(attribute_status_pca, overall_status_network_pca)

# Scale the data
cluster_data_scaled <- scale(cluster_data)

# Determine optimal number of clusters
fviz_nbclust(cluster_data_scaled, kmeans, method = "silhouette")

fviz_nbclust(cluster_data_scaled, kmeans, method = "wss")

# Perform k-means clustering with the optimal number of clusters (from plots above)
# Let's assume the optimal number is 4 based on the previous plots
k <- 4
kmeans_result <- kmeans(cluster_data_scaled, centers = k, nstart = 25)

# Add cluster assignments to the data
integrated_status$cluster <- as.factor(kmeans_result$cluster)

# Visualize the clusters
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca, color = cluster)) +
  geom_point(alpha = 0.7) +
  geom_text_repel(data = subset(integrated_status, 
                               attribute_status_pca > quantile(attribute_status_pca, 0.9) | 
                               overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
                 aes(label = country), max.overlaps = 15) +
  facet_wrap(~ year) +
  labs(x = "Attribute-Based Status", y = "Network-Based Status",
       title = "Status Clusters by Year") +
  theme_minimal()

# Compare cluster assignments with status quartiles
table(integrated_status$cluster, integrated_status$combined_status_quartile)
##    
##       1   2   3   4
##   1 380 349  44   0
##   2   0   0   0  15
##   3   0  29 332 158
##   4   0   0   0 198

Hierarchical Clustering

hierarchical clustering:

# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled)

# Perform hierarchical clustering
hc_result <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc_result, main = "Hierarchical Clustering Dendrogram", 
     xlab = "", sub = "", labels = FALSE)

# Cut the dendrogram to create the desired number of clusters
hc_clusters <- cutree(hc_result, k = 4)

# Add hierarchical cluster assignments to the data
integrated_status$hc_cluster <- as.factor(hc_clusters)

# Visualize the hierarchical clusters
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca, color = hc_cluster)) +
  geom_point(alpha = 0.7) +
  geom_text_repel(data = subset(integrated_status, 
                               attribute_status_pca > quantile(attribute_status_pca, 0.9) | 
                               overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
                 aes(label = country), max.overlaps = 15) +
  facet_wrap(~ year) +
  labs(x = "Attribute-Based Status", y = "Network-Based Status",
       title = "Hierarchical Status Clusters by Year") +
  theme_minimal()

Status Mobility Analysis

# Create a wide format dataset to analyze status changes
status_over_time <- integrated_status %>%
  select(country, year, combined_status_score, status_type) %>%
  pivot_wider(
    names_from = year,
    values_from = c(combined_status_score, status_type),
    names_sep = "_"
  )

# Calculate status change for countries present in multiple years
status_change <- integrated_status %>%
  select(country, year, combined_status_score) %>%
  arrange(country, year) %>%
  group_by(country) %>%
  mutate(
    prev_year = lag(year),
    prev_score = lag(combined_status_score),
    status_change = combined_status_score - prev_score
  ) %>%
  filter(!is.na(status_change)) %>%
  ungroup()

# Visualize status changes
ggplot(status_change, aes(x = year, y = status_change, group = country)) +
  geom_line(alpha = 0.3) +
  geom_point(alpha = 0.3) +
  geom_text_repel(data = subset(status_change, 
                              abs(status_change) > quantile(abs(status_change), 0.9, na.rm = TRUE)),
                 aes(label = country), max.overlaps = 15) +
  labs(x = "Year", y = "Status Change",
       title = "Status Mobility Over Time") +
  theme_minimal()

Export the Integrated Dataset

# Export the final integrated dataset
write.csv(integrated_status, "integrated_status_dimensions.csv", row.names = FALSE)